The analysis will use the dataset GSE20437 obtained from GEO.The dataset is generated from Affymetrix HU133A microarrays and contains 42 tissue samples.
In detail, the data includes:
18 reduction mammoplasty (RM) breast epithelium samples,
18 histologically normal (HN) epithelial samples from breast cancer patients (9 ER+ and 9 ER-), and
6 histologically normal epithelial samples from prophylactic mastectomy patients.
Note that sample numbers correspond to individual patient samples.
# download the GSE20437 expression data series
#gse <- getGEO("GSE20437", destdir= './data/', getGPL = F)
# load the local copy of the data
gse <- getGEO(file = "./data/GSE20437_series_matrix.txt.gz", getGPL = FALSE)
# getGEO returns a list of expression objects, but...
length(gse)
## [1] 1
# shows us there is only one object in it.
# We assign it to the same variable.
#gse <- gse[[1]] # run only if you download data and
# if you don't use the local copy
# extract metadata
metadata <- data.frame(gse@phenoData@data)
expr(gse[1])
## gse[1]
For the later analysis, it is useful to annotate the probe sets of the GEO data set to gene symbol. To do that, first we extract the name of the probes and then, we perform the annotation using biomaRt.
id_to_annotate <- rownames(gse@assayData[["exprs"]])
# extract id to annotate
# annotation
#mart <- useMart("ENSEMBL_MART_ENSEMBL")
#mart <- useDataset("hsapiens_gene_ensembl", mart)
#annotLookup <- getBM(
# mart = mart,
# attributes = c(
# "affy_hg_u133_plus_2",
# "ensembl_gene_id",
# "gene_biotype",
# "external_gene_name",
# "description"),
# filter = "affy_hg_u133_plus_2",
# values = id_to_annotate,
# uniqueRows=TRUE)
#saveRDS(annotLookup, file = "output/annotLookup.rds")
# load the local copy to use when biomaRt is unavailable
annotLookup <- readRDS("output/annotLookup.rds")
head(annotLookup)
indicesLookup <- match(rownames(gse@assayData[["exprs"]]), annotLookup$affy_hg_u133_plus_2)
#head(annotLookup[indicesLookup, "external_gene_name"])
dftmp <- data.frame(rownames(gse@assayData[["exprs"]]),
annotLookup[indicesLookup,
c("affy_hg_u133_plus_2", "external_gene_name")])
head(dftmp, 20)
length(rownames(gse@assayData[["exprs"]]))
## [1] 22283
table(dftmp[,1] == dftmp[,2])
##
## TRUE
## 20259
p <- ggplot(metadata, aes(x=disease.state.ch1, fill=specimen.ch1))+
geom_bar()+
scale_fill_manual(values = my_colors[c(1,4,6,7)])
p + labs(x = "Group")
# show what we have:
show(gse)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 22283 features, 42 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM512539 GSM512540 ... GSM512580 (42 total)
## varLabels: title geo_accession ... tissue:ch1 (38 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 20197764
## Annotation: GPL96
The actual expression data are accessible in the exprs
section of gse, an Expression Set and the generic data
class that BioConductor uses for expression data.
head(exprs(gse))
## GSM512539 GSM512540 GSM512541 GSM512542 GSM512543 GSM512544 GSM512545 GSM512546 GSM512547 GSM512548 GSM512549
## 1007_s_at 2461.4 3435.7 1932.5 2377.7 3055.3 2978.1 2348.5 2963.9 2776.9 3088.9 3033.3
## 1053_at 26.7 159.0 31.2 140.7 69.9 98.5 37.0 59.9 86.7 107.2 64.0
## 117_at 82.6 243.4 150.2 95.1 209.3 103.4 91.2 168.4 162.7 203.2 143.7
## 121_at 942.3 897.5 840.8 870.9 685.4 791.8 886.5 954.2 843.1 775.3 847.6
## 1255_g_at 71.8 87.9 75.4 58.1 31.8 40.3 70.5 43.3 51.6 42.6 74.9
## 1294_at 630.2 571.4 346.3 679.9 1289.3 421.1 417.6 811.6 778.1 393.2 995.4
## GSM512550 GSM512551 GSM512552 GSM512553 GSM512554 GSM512555 GSM512556 GSM512557 GSM512558 GSM512559 GSM512560
## 1007_s_at 3037.1 3545.8 3322.6 1963.7 3609.6 2078.9 4138.6 4260.7 2453.6 2709.0 2612.5
## 1053_at 82.9 97.7 69.7 82.0 45.6 84.5 31.7 37.4 82.4 204.8 119.3
## 117_at 113.5 80.0 186.4 106.6 145.6 144.4 133.6 278.6 173.0 147.8 186.0
## 121_at 912.2 911.6 862.4 705.0 984.6 853.8 846.8 1273.0 833.6 908.1 806.2
## 1255_g_at 53.7 30.5 15.2 42.5 76.6 88.2 90.6 65.8 25.8 77.5 84.3
## 1294_at 987.7 938.5 924.6 480.8 1054.1 632.0 448.0 1345.2 1248.9 405.7 647.5
## GSM512561 GSM512562 GSM512563 GSM512564 GSM512565 GSM512566 GSM512567 GSM512568 GSM512569 GSM512570 GSM512571
## 1007_s_at 4340.1 3155.3 2390.3 2738.8 3233.1 2836.6 2915.4 3457.5 2798.7 4370.2 2467.3
## 1053_at 76.7 100.3 115.4 14.1 47.6 77.1 47.1 47.0 83.2 40.2 80.3
## 117_at 168.0 95.2 73.6 122.7 107.6 120.9 143.4 92.5 72.1 131.8 156.4
## 121_at 827.0 629.4 709.2 305.6 877.4 425.7 643.8 771.3 681.1 812.7 533.4
## 1255_g_at 87.9 44.6 59.3 12.0 82.1 59.2 62.2 28.3 97.6 8.1 17.9
## 1294_at 2218.1 1321.1 606.7 1709.9 980.8 1268.4 955.8 1157.5 888.6 1130.8 905.1
## GSM512572 GSM512573 GSM512574 GSM512575 GSM512576 GSM512577 GSM512578 GSM512579 GSM512580
## 1007_s_at 3669.5 3310.1 3942.2 4520.4 3596.1 2989.0 3164.5 2764.3 4258.5
## 1053_at 24.1 8.8 44.6 54.7 56.7 89.9 63.4 57.0 59.5
## 117_at 165.8 141.6 97.1 132.7 124.3 210.5 131.4 89.6 123.3
## 121_at 746.9 1090.3 1008.7 718.6 988.4 295.9 957.3 630.8 869.2
## 1255_g_at 53.0 39.9 11.0 50.2 60.0 34.3 33.5 61.7 50.4
## 1294_at 1138.5 483.0 1326.5 1179.4 668.3 863.2 1055.5 1287.6 1127.8
To conveniently access the data rows and columns present in
exprs(gse), this matrix is assigned to its own variable
ex.
# exprs (gse) is a matrix that we can assign to its own variable, to
# conveniently access the data rows and columns
ex <- exprs(gse)
dim(ex) # 42 sample, 22283 genes
## [1] 22283 42
The dataset contains gene expression data of 22283 genes (rows) from 42 patients (columns).
# Analyze value distributions
boxplot(ex, main = 'Boxplot of the data before normalization',
xlab = "Samples",
ylab = "Expression Value",
varwidth = TRUE
)
Boxplot of the data before normalization
The boxplot shows that scaling is necessary. So, in this case, I try to apply a log transformation to the data.
ex2<-log(ex)
ex2 <- na.omit(as.matrix(ex2))
#dim(ex2) # 22283 42 same as before
boxplot(ex2, main = 'Boxplot of the data after applying a logarithmic transformation',
xlab = "Samples",
ylab = "Expression Value"
)
Boxplot of the data after applying a logarithmic transformation
From the boxplot after the log transformation, I can see that there is some variation in the median of the samples. So, one of the simplest normalization strategies is to align the log values so that all channels have the same median. A convenient choice is zero so that positive or negative values reflect signals above or below the median for a particular channel.
normalized.log.ex=scale(log(ex))
# boxplot post median normalization
boxplot(normalized.log.ex,
main = 'Boxplot of the data after median normalization',
xlab = "Samples",
ylab = "Expression Value")
Boxplot of the data after normalization
PCA is a dimensionality reduction technique that allows to condense thousands of dimensions into just two or three.For the dataset’s samples, the PCA scores display the coordinates in relation to these additional dimensions.
pca <- prcomp(t(normalized.log.ex))
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
## Standard deviation 21.6856 12.78817 11.18542 10.68629 10.28318 9.05796 8.85994 8.67632 8.31063 7.98169 7.80143
## Proportion of Variance 0.1798 0.06253 0.04783 0.04366 0.04043 0.03137 0.03001 0.02878 0.02641 0.02436 0.02327
## Cumulative Proportion 0.1798 0.24232 0.29016 0.33382 0.37425 0.40561 0.43563 0.46441 0.49081 0.51517 0.53844
## PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23
## Standard deviation 7.64884 7.42525 7.35447 7.25191 7.12698 7.03899 6.96990 6.86700 6.81828 6.7261 6.71031 6.59828
## Proportion of Variance 0.02237 0.02108 0.02068 0.02011 0.01942 0.01894 0.01857 0.01803 0.01777 0.0173 0.01722 0.01665
## Cumulative Proportion 0.56081 0.58189 0.60257 0.62267 0.64209 0.66104 0.67961 0.69764 0.71541 0.7327 0.74993 0.76657
## PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 6.51084 6.39868 6.39565 6.35069 6.16713 6.14788 6.07658 6.01840 5.86441 5.80650 5.71226 5.62268
## Proportion of Variance 0.01621 0.01565 0.01564 0.01542 0.01454 0.01445 0.01412 0.01385 0.01315 0.01289 0.01248 0.01209
## Cumulative Proportion 0.78278 0.79843 0.81407 0.82949 0.84403 0.85848 0.87260 0.88645 0.89960 0.91249 0.92496 0.93705
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 5.46227 5.35021 5.28516 5.20027 5.10709 5.01262 4.456e-14
## Proportion of Variance 0.01141 0.01094 0.01068 0.01034 0.00997 0.00961 0.000e+00
## Cumulative Proportion 0.94846 0.95940 0.97008 0.98042 0.99039 1.00000 1.000e+00
#screeplot(pca)
To get the summary of the PCA and the plot showing the variance explained by the first 10 components, it is possible to use the functions commented in the chunks above.
However, using ggplot2 and factoextra
packages is possible to get a more concise and informative plot
reporting the same information.
pcaVar <- get_eig(pca)
pcaVar <- pcaVar$variance.percent[1:10]
screeDf <- data.frame("Dimensions" = as.factor(seq(1,10)),
"Percentages" = pcaVar,
"Labels" = paste(round(pcaVar, 2), "%"))
p <- ggplot(data = screeDf, aes(x=Dimensions, y=Percentages))+
geom_bar(stat = "identity", fill = "#d1105a")+
geom_text(aes(label=Labels), vjust=-0.5, color="black", size=3.6)+
ggtitle("Scree Plot")+
ylab("Percentage of variance explained")+
scale_x_discrete(labels = as.factor(seq(1,10)))
p
Scree Plot
The scree plot shows that the first dimensions on the left are the more important because the percentage of variance explained by them is higher. The remaining principal components account for a very small proportion of the variability and are probably unimportant.
Let’s try to plot the PCA, looking if we can see a separation between Control and Breast Cancer groups.
# draw PCA plot control VS breast cancer
group <- c(rep("cadetblue1",18), rep("red",18), rep("cadetblue1",6) )
plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1 and 2", type="p", pch=10, col=group)
text(pca$x[,1], pca$x[,2], rownames(pca$data), cex=0.75)
legend("topleft", col=c("cadetblue1","red"), legend = c("Control", "Breast Cancer"),
pch = 20, bty='n', cex=.55)
PCA analysis of 2 components showing breast cancer and control samples.
Let’s try to add the control subtypes. The vector group used in the PCA plot is based on the data. The samples corresponding to the colors are the following:
Light blue: reduction mammoplasty (RM) breast epithelium samples
Red: histologically normal (HN) epithelial samples from breast cancer patient
Purple: histologically normal breast epithelium (NlEpi) from prophylactic mastectomy patient samples
# draw PCA plot
group <- c(rep("cadetblue1",18), rep("red",18), rep("purple",6) ) # vector of colors based on the order of my data
plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1 and 2", type="p", pch=10, col=group)
text(pca$x[,1], pca$x[,2], rownames(pca$data), cex=0.75)
legend("topleft", col=c("cadetblue1","red","purple"), legend = c("Reduction Mammoplasty", "Breast Cancer", "Prophylactic Mastectomy"),
pch = 20, bty='n', cex=.55)
PCA analysis of 2 components showing breast cancer and the two subtypes of control samples.
Then, I try to see if there is a separation also inside different types of Breast Cancer.
# draw PCA plot with all subtypes
group <- c(rep(my_colors[7],18), rep(my_colors[4],9), rep(my_colors[1],9), rep(my_colors[6],6) ) # vector of colors based on the order of my data
plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1 and 2", type="p", pch=10, col=group)
text(pca$x[,1], pca$x[,2], rownames(pca$data), cex=0.75)
legend("topleft", col=c(my_colors[7],my_colors[4],my_colors[1],my_colors[6]), legend = c("Reduction Mammoplasty", "ER+ Breast Cancer", "ER- Breast Cancer", "Prophylactic Mastectomy"),
pch = 20, bty='n', cex=.55)
PCA analysis of 2 components, showing all samples specimen.
Let’s try to explore an interactive PCA plot.
components<-pca[["x"]]
components<-data.frame(components)
type<-c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))
components<-cbind(components, type )
fig <- plot_ly(components, x=~PC1, y=~PC2,
color=type,colors=c('cadetblue1', 'red','purple'),
type='scatter',mode='markers')
fig
fig2 <- plot_ly(components, x=~PC1, y=~PC2, z=~PC3,
color=type, colors=c('cadetblue1', 'red','purple'),
mode='markers', marker = list(size = 4))
fig2
fig3 <- plot_ly(components, x=~PC1, y=~PC3,
color=type, colors=c('cadetblue1', 'red','purple'),
type='scatter',mode='markers')
fig3
set.seed(123)
umap_result <- umap(t(normalized.log.ex))
umap_df <- as.data.frame(umap_result$layout)
colnames(umap_df) <- c("UMAP1", "UMAP2")
# Add sample types to the UMAP data frame
umap_df$type <- c(rep("RM", 18), rep("HN",18), rep("NlEpi",6))
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = type)) +
geom_point(size = 3, alpha = 0.8) +
scale_color_manual(values = c("RM" = "cadetblue1",
"HN" = "red",
"NlEpi" = "purple")) +
theme_minimal() +
labs(title = "2D UMAP Projection of GSE20437", x = "UMAP1", y = "UMAP2")
2D UMAP
K-means is an unsupervised method that relies on the tuning parameter \(k\) (number of resulting clusters). This method is sensitive to outliers, so it is used for a first exploratory analysis.
Let’s try to compute K-means by setting \(k=2\), since we know that there are 2 groups: breast cancer and control.
set.seed(1)
k <- 2 # number of clusters
kmeans_result2 <- kmeans(t(normalized.log.ex),k)
table(kmeans_result2$cluster) # tells how many samples were assigned to each cluster
##
## 1 2
## 14 28
The results are plotted with a PCA for a better visualization.
# Create a data frame for plotting
cluster_df_k2 <- data.frame(
PC1 = pca$x[,1],
PC2 = pca$x[,2],
Cluster = as.factor(kmeans_result2$cluster), # Cluster from K-means
Disease = metadata$disease.state.ch1 # Disease state (Breast cancer or not)
)
enhanced_colors <- c("#1f77b4", "#ff7f0e")
ggplot(cluster_df_k2, aes(x = PC1, y = PC2, color = Cluster, shape = Disease)) +
geom_point(size = 3, alpha = 0.8) +
scale_color_manual(values = enhanced_colors) + # Color code for clusters
scale_shape_manual(values = c(16, 17)) + # Shape code for disease state
labs(title = "K-means Clustering of PCA Components",
x = "PCA1", y = "PCA2",
color = "Cluster", shape = "Disease State") +
theme_minimal() +
theme(legend.position = "top")
K-means clustering of PCA components 1 and 2 breast cancer and control samples.
## Create list with the predictions
pred_k <- kmeans_result2$cluster
pred_k[which(pred_k==1)] <- "control"
pred_k[which(pred_k==2)] <- "breast cancer"
pred_k <- factor(pred_k,levels = c("control", "breast cancer"))
true_labels <- ifelse(metadata$disease.state.ch1 == "breast cancer", 1, 0)
pred_labels <- ifelse(pred_k == "breast cancer", 1, 0)
table(true_labels)
## true_labels
## 0 1
## 24 18
table(pred_labels)
## pred_labels
## 0 1
## 14 28
# Compute ROC
roc_k <- roc(response = true_labels, predictor = pred_labels)
auc_k <- auc(roc_k)
## Accuracy
acc_k <- mean(pred_k==metadata$disease.state.ch1)
## Crete dataframe to store the accuracy results
res_df <- data.frame(Kmeans = c(acc_k, auc_k))
rownames(res_df) <- c("Accuracy", "AUC")
plot(roc_k, main = paste("ROC Curve (AUC =", round(auc_k, 3), ")"), col = "#d62728", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "gray")
ROC curve K-means
Let’s try to increase the number of clusters to 4, as we know that there are 4 subtypes. The control group contains both Reduction mammoplasty (RM) breast epithelium samples and histologically normal breast epithelium (NlEpi) from prophylactic mastectomy patient samples; while the breast cancer group contains histologically normal (HN) epithelial samples from breast cancer patient both with ER+ and ER-.
# K-means clustering with k = 4
set.seed(1)
k <- 4 # Number of clusters
kmeans_result_k4 <- kmeans(t(normalized.log.ex), k)
table(kmeans_result_k4$cluster)
##
## 1 2 3 4
## 4 7 10 21
# Create a data frame for plotting
cluster_df_k4 <- data.frame(
PC1 = pca$x[,1],
PC2 = pca$x[,2],
Cluster = as.factor(kmeans_result_k4$cluster), # Cluster from K-means
Disease = metadata$disease.state.ch1, # Disease state (Breast cancer or not)
Specimen = metadata$specimen.ch1 # Specimen type (e.g., control, cancer)
)
enhanced_colors <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728")
shape_values <- c(15, 16, 17, 18) # Square, Circle, Triangle, Diamond
ggplot(cluster_df_k4, aes(x = PC1, y = PC2, color = Cluster, shape = Specimen)) +
geom_point(size = 4, alpha = 0.8) +
scale_color_manual(values = enhanced_colors) +
scale_shape_manual(values = shape_values) + # Different shapes for specimen types
labs(title = "K-means Clustering of PCA Components (k=4)",
x = "PCA1", y = "PCA2",
color = "Cluster", shape = "Specimen Type") +
theme_minimal() +
theme(legend.position = "top")
K-means clustering of PCA components 1 and 2 with k=4.
Hierarchical clustering produces a dendrogram. At each step a distance matrix is computed and the points with the lower distance are clustered together. Then, the distance matrix is recomputed considering the new cluster as one point. In the end, the tree is cut to have the chosen number of clusters.
For this clustering method, distance between samples and between clusters must be computed. In this case, sample distances are computed with the Euclidean method, and the cluster distances with average linkage.
dist_matrix <- dist(t(normalized.log.ex), method = "euclidean")
hc <- hclust(dist_matrix, method = "average")
tree <- as.phylo(hc)
k_hc <- 2
hclusters <- cutree(hc, k = k_hc)
cluster_df <- data.frame(
sample = names(hclusters),
cluster = factor(hclusters)
)
annot <- data.frame(
sample = colnames(normalized.log.ex), # samples are columns
disease = metadata$disease.state.ch1,
specimen = metadata$specimen.ch1
)
tip_data <- merge(cluster_df, annot, by.x = "sample", by.y = "sample")
# Plot with samples' name
ggtree(tree, layout = "rectangular", branch.length = "none") %<+% annot +
# Tip points colored by disease, shaped by specimen
geom_tippoint(aes(color = disease, shape = specimen), size = 2) +
scale_color_manual(values = c("control" = "#1b9e77", "breast cancer" = "#d95f02")) +
scale_shape_manual(values = c(
"Reduction Mammoplasty" = 15,
"ER+ Breast Cancer" = 16,
"ER- Breast Cancer" = 17,
"Prophylactic Mastectomy" = 18
)) +
geom_tiplab(size = 1.7, angle = 85, hjust = 0.5, offset=-0.5)+ # name samples
theme_tree2() +
coord_flip() + scale_x_reverse() +
ggtitle("Hierarchical Clustering Dendrogram") +
theme(legend.position = "right")
Hierarchical clustering dendrogram. The sample distances are computed with Euclidean distance, while the cluster distance with average linkage.
# Plot without samples' name
ggtree(tree, layout = "rectangular", branch.length = "none") %<+% annot +
# Tip points colored by disease, shaped by specimen
geom_tippoint(aes(color = disease, shape = specimen), size = 2) +
scale_color_manual(values = c("control" = "#1b9e77", "breast cancer" = "#d95f02")) +
scale_shape_manual(values = c(
"Reduction Mammoplasty" = 15,
"ER+ Breast Cancer" = 16,
"ER- Breast Cancer" = 17,
"Prophylactic Mastectomy" = 18
)) +
#geom_tiplab(size = 1.7, angle = 85, hjust = 0.5, offset=-0.5)+ # name samples
theme_tree2() +
coord_flip() + scale_x_reverse() +
ggtitle("Hierarchical Clustering Dendrogram") +
theme(legend.position = "right")
Hierarchical clustering dendrogram without samples’ labels. The sample distances are computed with Euclidean distance, while the cluster distance with average linkage.
# Predictions (label clusters as "control" and "breast cancer")
pred_h <- hclusters
cluster1_majority <- names(which.max(table(pred_h[metadata$disease.state.ch1 == "control"])))
pred_h <- ifelse(hclusters == as.numeric(cluster1_majority), "control", "breast cancer")
pred_h <- factor(pred_h, levels = c("control", "breast cancer"))
# Accuracy
acc_h <- mean(pred_h == metadata$disease.state.ch1)
# AUC from ROC curve
true_labels_h <- ifelse(metadata$disease.state.ch1 == "breast cancer", 1, 0)
pred_labels_h <- ifelse(pred_h == "breast cancer", 1, 0)
roc_h <- roc(response = true_labels_h, predictor = pred_labels_h)
auc_h <- auc(roc_h)
# Update results table
res_df["Hierarchical"] <- c(acc_h, auc_h)
Supervised methods require the data to be divided into a training and a test set.
set.seed(1)
# Extract expression data
expr_data <- t(normalized.log.ex) # samples are rows
labels <- metadata$disease.state.ch1 # A factor with "control" and "breast cancer"
# 70% train, 30% test
train_index <- createDataPartition(labels, p = 0.7, list = FALSE)
# Split data
train_data <- expr_data[train_index, ]
test_data <- expr_data[-train_index, ]
train_labels <- labels[train_index]
test_labels <- labels[-train_index]
Random forest requires the tuning of 2 parameters: ntree
and the mtry. The first is the number of trees to grow and
the second is the number of features (genes) considered when building
each tree. The best parameters are chosen comparing OOB errors of random
forests fitted with different values of the two parameters.
set.seed(1234)
rf <- randomForest(x=train_data, y=as.factor(train_labels), ntree = 600)
plot(rf, main = "Random Forest")
Plot showing how the error rate changes according to the number of trees.
The plot above shows how the error rate changes according to the number of trees. This allows to track how model accuracy improves (or doesn’t) as more trees are added. The black line represent the overall OOB (out-of-bag) error rate, while the other colored lines report the class specific error rates. From the plot, we can see that a plateau in the OOB error is reached with a number of trees around 520.
control <- trainControl(method='cv',
search='grid')
mtry <- ncol(train_data)
tunegrid <- expand.grid(mtry = seq(1, min(50, mtry), by = 5))
rf_gridsearch <- train(x=train_data, y=train_labels,
method = 'rf',
ntree=520,
metric = 'Accuracy',
tuneGrid = tunegrid,
trControl = control)
mtry <- rf_gridsearch$bestTune$mtry
The best mtry results to be 41. So, a model is built on the training set with these parameters and then evaluated on the test set.
set.seed(1)
tunegrid <- expand.grid(.mtry=c(mtry))
rf <- train(x=train_data, y=train_labels,
method = 'rf',
ntree=520,
metric = 'Accuracy',
tuneGrid = tunegrid,
trControl = control)
set.seed(1)
## Apply it on the test set
pred_rf <- predict(rf, test_data)
## Accuracy
acc_rf <- mean(pred_rf==test_labels)
## Confusion matrix
table(pred_rf, test_labels)
## test_labels
## pred_rf breast cancer control
## breast cancer 1 1
## control 4 6
## AUC from ROC curve
prob_rf <- predict(rf, test_data, type = "prob")
roc_rf <- roc(test_labels, prob_rf$control)
# plot(roc_rf)
auc_rf <- auc(roc_rf)
## Update df
res_df["RandomForest"] <- c(acc_rf, auc_rf)
rf <- randomForest(x=train_data, y=as.factor(train_labels), ntree = 520, mtry=mtry)
# Get importance values
rf_importance <- importance(rf)
# Extract probe IDs in order of importance
ordered_probes <- rownames(rf_importance)[order(rf_importance[, 1], decreasing = TRUE)]
# Ensure correct mapping exists
probe_to_symbol <- annotLookup[, c("affy_hg_u133_plus_2", "external_gene_name")]
colnames(probe_to_symbol) <- c("probe", "symbol")
probe_symbols <- setNames(probe_to_symbol$symbol, probe_to_symbol$probe)
top_vars <- head(ordered_probes, 30) # Show top 30 features
imp_df <- data.frame(
probe = top_vars,
importance = rf_importance[top_vars, 1],
gene = ifelse(top_vars %in% names(probe_symbols), probe_symbols[top_vars], top_vars)
)
ggplot(imp_df, aes(x = reorder(gene, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#2e005d") +
coord_flip() +
labs(title = "Top 30 Variable Importance (Random Forest)", x = "Gene", y = "Importance") +
theme_minimal(base_size = 12)
The plot shows the top 30 important variable according to random forest. The missing gene on the y-axis correspond to 221713_s_at. It is empty because it does not have a corresponding external gene name found in the annotation of biomaRt.
expr_data_t <- t(expr_data)
top_expr <- expr_data_t[top_vars, , drop = FALSE] # rows = probes, cols = samples
top_expr_df <- as.data.frame(top_expr)
top_expr_df$probe <- rownames(top_expr_df)
top_expr_long <- melt(top_expr_df, id.vars = "probe", variable.name = "sample", value.name = "expression")
# Add disease info
top_expr_long$disease <- metadata$disease.state.ch1[match(top_expr_long$sample, metadata$geo_accession)]
# map probes to gene symbols
top_expr_long$gene <- ifelse(top_expr_long$probe %in% names(probe_symbols),
probe_symbols[top_expr_long$probe],
top_expr_long$probe)
ggplot(top_expr_long, aes(x = disease, y = expression, fill = disease)) +
geom_boxplot(outlier.shape = NA) +
facet_wrap(~ gene, scales = "free_y", ncol = 5) +
theme_minimal(base_size = 12) +
labs(title = "Expression of Top 30 Genes by Disease State",
x = "Disease State", y = "Expression") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Plot showing how the expression of the top 30 genes changes in breast cancer and in the control.
# Select top N important probes
top_n <- 30
top_probes <- imp_df$probe[1:top_n]
heatmap_data <- normalized.log.ex[top_probes, ]
# Add gene symbols
gene_symbols <- imp_df$gene[1:top_n]
rownames(heatmap_data) <- gene_symbols
pheatmap(heatmap_data,
scale = "row", # normalize expression within each gene
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "complete",
show_rownames = TRUE,
show_colnames = TRUE,
fontsize_row = 8,
main = "Top Random Forest Genes Heatmap")
Heatmap of the most significant genes according to random forest.
# Ensure sample labels are aligned with expression data
sample_types <- metadata$type
names(sample_types) <- colnames(normalized.log.ex)
# Subset for current samples
sample_types <- sample_types[colnames(heatmap_data)]
# Set rownames to match sample IDs
rownames(metadata) <- metadata$geo_accession
# Create annotation dataframe
annotation_col <- data.frame(Type = metadata[colnames(heatmap_data), "specimen.ch1"])
rownames(annotation_col) <- colnames(heatmap_data)
# Convert to factor
annotation_col$Type <- factor(annotation_col$Type)
type_levels <- levels(annotation_col$Type)
palette_colors <- brewer.pal(n = length(type_levels), name = "Set2") # or "Dark2", "Paired", etc.
names(palette_colors) <- type_levels
ann_colors <- list(Type = palette_colors)
pheatmap(heatmap_data,
scale = "row",
annotation_col = annotation_col,
annotation_colors = ann_colors,
show_colnames = TRUE,
main = "Heatmap Top Random Forest Genes by Specimen")
Heatmap of the most significant genes according to random forest, showing the specimen.
# Convert group labels to factor
group_factor <- as.factor(metadata$disease.state.ch1)
# Perform t-tests
ttest <- genefilter::rowttests(as.matrix(t(expr_data)), group_factor)
ttest <- which(p.adjust(ttest$p.value)<0.1)
## Reduce the original dataset
ttest_data <- expr_data[,ttest]
#dim(ttest_data) #42 2
ttest_train <- ttest_data[train_index, ]
#dim(ttest_train) #30 2
set.seed(1)
## Fit the model with cv
control <- trainControl(method="cv", number=10)
lda_fit <- train(ttest_train, train_labels, method="lda",
metric="Accuracy", trControl=control)
print(lda_fit)
## Linear Discriminant Analysis
##
## 30 samples
## 2 predictor
## 2 classes: 'breast cancer', 'control'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 27, 27, 27, 26, 27, 28, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7583333 0.53
## Apply it on the test set
pred_lda <- predict(lda_fit, test_data)
## Accuracy
acc_lda <- mean(pred_lda==test_labels)
## AUC from ROC curve
pred_lda <- predict(lda_fit, test_data, type = "prob")
roc_lda <- roc(test_labels, pred_lda[,"breast cancer"])
auc_lda <- auc(roc_lda)
#plot(roc_rf)
auc_lda <- auc(roc_lda)
## Update results table
res_df["LDA"] <- c(acc_lda, auc_lda)
#print(lda_fit)
set.seed(1)
## fit Lasso with cv
control <- trainControl(method="cv", number=10)
tunegrid <- expand.grid(alpha=1,
lambda=seq(.0001,1,by=.001))
lasso_fit <- train(train_data, train_labels, method="glmnet", family="binomial",
tuneGrid = tunegrid,
metric="Accuracy", trControl=control)
#print(lasso_fit)
# Apply it on the test set
pred_lasso <- predict(lasso_fit, test_data)
## Accuracy
acc_lasso <- mean(pred_lasso==test_labels)
pred_lasso <- predict(lasso_fit, test_data, type = "prob")
roc_lasso <- roc(test_labels, pred_lasso[,"breast cancer"])
auc_lasso <- auc(roc_lasso)
## Update results table
res_df["Lasso"] <- c(acc_lasso, auc_lasso)
results <- lasso_fit$results
ggplot(results, aes(x = lambda, y = Accuracy)) +
geom_line(color = "steelblue") +
geom_point(color = "darkred") +
labs(title = "Lasso: Accuracy vs. Lambda",
x = expression(lambda),
y = "Accuracy") +
theme_minimal()
Plot showing how LASSO accuracy changes according to the lambda parameter chosen.
best_lambda <- lasso_fit$bestTune$lambda
print(best_lambda)
## [1] 0.2021
The goal of this method is to avoid batch effects and obtain a method that is repeatable and reproducible.
This method compares signatures of different individuals. Each signature is sorted for the expression value. The most important genes for each signature are the most and the less expressed. Then signatures are compared and a map is constructed, where clusters can be seen if they are closely connected.
set.seed(1)
## Apply SCUDO on the training set
scudo_train <- scudoTrain(t(train_data), groups = as.factor(train_labels),
nTop = 25, nBottom = 25, alpha = 0.05)
scudo_train
## Object of class ScudoResults
## Result of scudoTrain
##
## Number of samples : 30
## Number of groups : 2
## breast cancer : 13 samples
## control : 17 samples
## upSignatures length : 25
## downSignatures length : 25
## Fold-changes : computed
## grouped : No
## Feature selection : performed
## Test : Wilcoxon rank sum test
## p-value cutoff : 0.05
## p.adjust method : none
## Selected features : 1489
## perform validation using testing samples
scudo_test <- scudoTest(scudo_train, t(test_data), as.factor(test_labels),
nTop = 25, nBottom = 25)
## Plot the results
testNet <- scudoNetwork(scudo_test, N = 0.2)
scudoPlot(testNet, vertex.label = NA)
SCUDO network
## Classification
pred_scudo <- scudoClassify(t(train_data), t(test_data),
nTop = 25, nBottom = 25, N = 0.2,
trainGroups = as.factor(train_labels),
featureSel = FALSE)
## Accuracy
acc_scudo <- mean(pred_scudo$predicted==test_labels)
## AUC from ROC curve
roc_scudo <- roc(test_labels, pred_scudo$scores[,2])
# plot(roc_rf)
auc_scudo <- auc(roc_scudo)
## Update results table
res_df["Scudo"] <- c(acc_scudo, auc_scudo)
The models fitted before are compared by accuracy and AUC.
res_df <- as.data.frame(res_df)
res_df$metric <- rownames(res_df)
res_long <- pivot_longer(res_df,
cols = -metric,
names_to = "Model",
values_to = "Value")
#order models by AUC
res_long$Model <- factor(res_long$Model,
levels = res_long %>%
filter(metric == "AUC") %>%
arrange(desc(Value)) %>%
pull(Model))
# Plot with value labels
ggplot(res_long, aes(x = Model, y = Value, fill = metric)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
geom_text(aes(label = round(Value, 2)),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 3) +
scale_fill_manual(values = c("Accuracy" = "#1b9e77", "AUC" = "#d95f02")) +
labs(title = "Model Performance: Accuracy vs AUC",
x = "Model",
y = "Metric Value",
fill = "Metric") +
ylim(0, 1.1) + # add a little headroom for the labels
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Comparison of the models’ performances.
The functional enrichment analysis is performed with
gprofiler2, with the objective of finding out the molecular
functions more represented among the most important genes. For this
purpose, the genes with highest importance score according to the random
forest model fitted before are considered.
geneList <- head(ordered_probes, 30)
gost_res <- gost(query = geneList,
organism = "hsapiens",
ordered_query = FALSE, multi_query = FALSE, significant = FALSE,
exclude_iea = FALSE, measure_underrepresentation = FALSE, evcodes = FALSE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL, numeric_ns = "",
sources = NULL, as_short_link = FALSE)
gost_df <- gost_res$result[c("term_id", "term_name", "p_value", "significant")]
gost_df <- gost_df[order(gost_df$p_value),]
gost_df <- gost_df[which(gost_df$significant==TRUE),]
# head(gost_df)
# visualize results using a Manhattan plot
p <- gostplot(gost_res, capped = TRUE, interactive = FALSE)
publish_gostplot(p, filename = NULL)
Results of the functional enrichment analysis
publish_gosttable(gost_res, highlight_terms = gost_df$term_id[1:10],
use_colors = TRUE, filename = NULL,
show_columns = c("term_id", "term_name", "p_value"))
Results of the functional enrichment analysis
The network analysis is performed with pathfindR, using
the KEGG, Gene Ontology and Reactome
databases. For this analysis, the 100 genes with the lowest p-value
obtained with a t-test are taken into consideration. This list of genes
has 23 genes in common with the one of the most important genes used for
the functional enrichment.
# Map probe IDs to Ensembl gene IDs
probe_to_ensembl <- setNames(annotLookup$ensembl_gene_id, annotLookup$affy_hg_u133_plus_2)
new_colnames <- probe_to_ensembl[colnames(expr_data)]
# Warn if there are unmatched probe IDs
unmatched <- is.na(new_colnames)
if (any(unmatched)) {
warning(sum(unmatched), " probe IDs in expr_data were not found in annotLookup and will be removed.")
}
expr_data_new <- expr_data[, !unmatched]
colnames(expr_data_new) <- new_colnames[!unmatched]
# duplicated Ensembl IDs
expr_t <- t(expr_data_new)
expr_df <- data.frame(expr_t, ensembl_id = colnames(expr_data_new))
collapsed <- aggregate(. ~ ensembl_id, data = expr_df, FUN = mean)
rownames(collapsed) <- collapsed$ensembl_id
collapsed$ensembl_id <- NULL
expr_data_collapsed <- t(collapsed)
# Run t-test across two groups
tt <- genefilter::rowttests(as.matrix(t(expr_data_collapsed)), group_factor)
# Sort by p-value
tt <- tt[order(tt$p.value), ]
# Extract top 100 genes by p-value
top_n <- 100
geneList_tt <- data.frame(
ENSEMBL = rownames(tt)[1:top_n],
p.value = tt$p.value[1:top_n]
# p.adjust = p.adjust(tt$p.value[1:top_n])
)
# Remove any trailing version numbers in Ensembl IDs (e.g., ENSG000001234.1 -> ENSG000001234)
gene_ids_split <- unlist(strsplit(as.character(geneList_tt$ENSEMBL), ".", fixed = TRUE))
geneList_tt$ENSEMBL <- gene_ids_split[startsWith(gene_ids_split, "ENS")]
# Annotate Ensembl IDs with gene symbols
gene_symbol_df <- bitr(geneList_tt$ENSEMBL,
fromType = "ENSEMBL",
toType = "SYMBOL",
OrgDb = org.Hs.eg.db)
geneList_tt <- merge(gene_symbol_df, geneList_tt)
# dim(geneList_tt) #105 3
net_KEGG <- run_pathfindR(
geneList_tt[c("SYMBOL", "p.value")],
iterations = 1,
gene_sets = "KEGG",
silent_option = FALSE
)
net_GO <- run_pathfindR(
geneList_tt[c("SYMBOL", "p.value")],
iterations = 1,
gene_sets = "GO-All",
silent_option = FALSE
)
net_Reactome <- run_pathfindR(geneList_tt[c("SYMBOL", "p.value")],
iterations = 1,
gene_sets = "Reactome",
silent_option = FALSE)
enrichment_chart(net_KEGG)
Results of the biological network analysis
enrichment_chart(net_GO)
Results of the biological network analysis
enrichment_chart(net_Reactome)
Results of the biological network analysis
term_gene_graph(net_Reactome, num_terms = 7, use_description = TRUE)
term_gene_graph(net_KEGG, num_terms = 7, use_description = TRUE)
term_gene_graph(net_GO, num_terms = 7, use_description = TRUE)
## cluster enriched terms
cluster_enriched_terms(net_Reactome)
## cluster enriched terms
cluster_enriched_terms(net_GO)
## cluster enriched terms
cluster_enriched_terms(net_KEGG)